Libraries & Setup

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(ggridges)
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-8
library(readr)
library(breakaway)

# Here we have made our input into smaller 100MB files and now read them in
setwd("rawdata/MetagenomicFirstRound/")
con <- pipe("cat part.gz_* | gunzip -c", "rb")
data <- read_csv(con)
## Rows: 1132002 Columns: 187
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (4): sample, tax_name, tax_rank, tax_path
## dbl (182): tax_id, N_reads, N_alignments, MAP_damage, MAP_significance, mean...
## lgl   (1): MAP_valid
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
close(con)
setwd("../../")
#metadata
metadata <- read.csv("metadata.csv")

# Minimum amount of damage filter
DamMin = 0.0
D_Max = 1.0
#Lambda Likelihood Ratio
LR = 0
# Minimum reads for parsing taxa
MinRead = 1
# Minimum mean read length
MinLength = 35

Metadata and seq mapping observations

Here we are looking at some values produced by fastp.

First lets look at the relationship between duplication percentage and ∆Ct (difference between the ideal number fo PCR cycles and used number of cycles). The est age of samples in BP is shown.

plot(metadata$Dct,metadata$duplication.rate,pch=16,ylab="% duplication")
abline(v=0,col="darkred",lwd=3)
text(0.2,60,labels="Underamplified", adj=0)
text(-0.2,60,labels="Overamplified", adj=1)
text(metadata$Dct,metadata$duplication.rate+1.5,labels = metadata$age,cex=0.4)

Now lets look at the low complexity proportion vs ∆Ct

plot(metadata$Dct,metadata$low.complexity,pch=16,ylab="% low complexity")
abline(v=0,col="darkred",lwd=3)
text(0.2,0.32,labels="Underamplified", adj=0)
text(-0.2,0.32,labels="Overamplified", adj=1)

Now how many post-fastp reads there are per sample.

plot(metadata$Dct,metadata$dedupReads,pch=16,ylab="Post fastp reads")
abline(v=0,col="darkred",lwd=3)
text(0.2,2.2*10^8,labels="Underamplified", adj=0)
text(-0.2,2.2*10^8,labels="Overamplified", adj=1)
text(metadata$Dct,metadata$dedupReads+5*10^6,labels = metadata$age,cex=0.4)

Age-Damage / Length relationships

First we wrangle the raw data

# Let's make an animal subset of the data 
df1 <- data %>% filter(MAP_damage > DamMin, N_reads >= MinRead, mean_L > MinLength, MAP_significance  > LR,  grepl("Metazoa",tax_path), grepl("^genus", tax_rank), grepl("", sample))
inputdata <- df1
inputdata$sample2 <- factor(inputdata$sample,levels= sort(unique(inputdata$sample),decreasing = TRUE))
inputdata$age <- metadata$age[match(inputdata$sample2,metadata$Sample)]
inputdata$site <- as.factor(metadata$CoreID[match(inputdata$sample2,metadata$Sample)])

raw.metazoa <- inputdata[inputdata$N_reads>99,]
raw.metazoa.50 <- inputdata[inputdata$N_reads>49,]
raw.metazoa.1  <- inputdata

raw.metazoa.50.wide <- raw.metazoa.50 %>%
  pivot_wider(
    id_cols = tax_name,         # Rows will be `tax_name`
    names_from = sample,        # Columns will be `sample`
    values_from = N_reads,      # Observations will be `N_reads`
    values_fill = 0             # Fill missing values with 0
  )


raw.metazoa.1.wide <- raw.metazoa.1 %>%
  pivot_wider(
    id_cols = tax_name,         # Rows will be `tax_name`
    names_from = sample,        # Columns will be `sample`
    values_from = N_reads,      # Observations will be `N_reads`
    values_fill = 0             # Fill missing values with 0
  )


# plants
df1 <- data %>% filter(MAP_damage > DamMin, N_reads >= MinRead, mean_L > MinLength, MAP_significance  > LR,  grepl("Viridiplantae",tax_path), grepl("^genus", tax_rank), grepl("", sample))
inputdata <- df1
inputdata$sample2 <- factor(inputdata$sample,levels= sort(unique(inputdata$sample),decreasing = TRUE))
inputdata$age <- metadata$age[match(inputdata$sample2,metadata$Sample)]
inputdata$site <- as.factor(metadata$CoreID[match(inputdata$sample2,metadata$Sample)])

raw.plants <- inputdata[inputdata$N_reads>99,]
raw.plants.50 <- inputdata[inputdata$N_reads>49,]

raw.plants.50.wide <- raw.plants.50 %>%
  pivot_wider(
    id_cols = tax_name,         # Rows will be `tax_name`
    names_from = sample,        # Columns will be `sample`
    values_from = N_reads,      # Observations will be `N_reads`
    values_fill = 0             # Fill missing values with 0
  )


# bacteria 

df1 <- data %>% filter(MAP_damage > DamMin, N_reads >= MinRead, mean_L > MinLength, MAP_significance  > LR,  grepl("Bacteria",tax_path), grepl("^genus", tax_rank), grepl("", sample))
inputdata <- df1
inputdata$sample2 <- factor(inputdata$sample,levels= sort(unique(inputdata$sample),decreasing = TRUE))
inputdata$age <- metadata$age[match(inputdata$sample2,metadata$Sample)]
inputdata$site <- as.factor(metadata$CoreID[match(inputdata$sample2,metadata$Sample)])

raw.bacteria <- inputdata[inputdata$N_reads>99,]
raw.bacteria.50<- inputdata[inputdata$N_reads>49,]

raw.bacteria.50.wide <- raw.bacteria.50 %>%
  pivot_wider(
    id_cols = tax_name,         # Rows will be `tax_name`
    names_from = sample,        # Columns will be `sample`
    values_from = N_reads,      # Observations will be `N_reads`
    values_fill = 0             # Fill missing values with 0
  )

Now we make some plots, first the animals

ggplot(raw.metazoa, aes(x = MAP_damage, y = sample2, group = sample2)) +
  geom_density_ridges2(scale = 3, alpha = 0.55) +  # Plot density ridges first
  geom_point(position = position_jitter(height = 0.1, width = 0), alpha = 0.6) + # Add points above ridges
  facet_wrap(~ site, scales = "free_y") + # Split into facets by coreID
  labs(y = "Samples (ordered by age)", x = "MAP Damage",title = "Metazoa MAP Damage")+
  geom_text(aes(x = max(MAP_damage), label = age), hjust = -0.1, size = 3,nudge_y = 0.4)
## Picking joint bandwidth of 0.00621
## Picking joint bandwidth of 0.0376
## Picking joint bandwidth of 0.0317
## Picking joint bandwidth of 0.042
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_text()`).

ggplot(raw.metazoa, aes(x = mean_L, y = sample2, group = sample2)) +
  geom_density_ridges2(scale = 3, alpha = 0.55) +  # Plot density ridges first
  geom_point(position = position_jitter(height = 0.1, width = 0), alpha = 0.6) + # Add points above ridges
  facet_wrap(~ site, scales = "free_y") + # Split into facets by coreID
  labs(y = "Samples (ordered by age)", x = "MAP Damage",title = "Metazoa Mean Length")+
  geom_text(aes(x = max(MAP_damage), label = age), hjust = -0.1, size = 3,nudge_y = 0.4)
## Picking joint bandwidth of 4.61
## Picking joint bandwidth of 3.38
## Picking joint bandwidth of 3.55
## Picking joint bandwidth of 3.35
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_text()`).

Now bacteria

ggplot(raw.bacteria, aes(x = MAP_damage, y = sample2, group = sample2)) +
  geom_density_ridges2(scale = 3, alpha = 0.55) +  # Plot density ridges first
  geom_point(position = position_jitter(height = 0.1, width = 0), alpha = 0.6) + # Add points above ridges
  facet_wrap(~ site, scales = "free_y") + # Split into facets by coreID
  labs(y = "Samples (ordered by age)", x = "MAP Damage",title = "Bacteria MAP Damage")+
  geom_text(aes(x = max(MAP_damage), label = age), hjust = -0.1, size = 3,nudge_y = 0.4)
## Picking joint bandwidth of 0.0026
## Picking joint bandwidth of 0.00941
## Picking joint bandwidth of 0.0166
## Picking joint bandwidth of 0.0342
## Warning: Removed 33 rows containing missing values or values outside the scale range
## (`geom_text()`).

ggplot(raw.bacteria, aes(x = mean_L, y = sample2, group = sample2)) +
  geom_density_ridges2(scale = 3, alpha = 0.55) +  # Plot density ridges first
  geom_point(position = position_jitter(height = 0.1, width = 0), alpha = 0.6) + # Add points above ridges
  facet_wrap(~ site, scales = "free_y") + # Split into facets by coreID
  labs(y = "Samples (ordered by age)", x = "MAP Damage",title = "Bacteria Mean Length")+
  geom_text(aes(x = max(MAP_damage), label = age), hjust = -0.1, size = 3,nudge_y = 0.4)
## Picking joint bandwidth of 7.26
## Picking joint bandwidth of 2.18
## Picking joint bandwidth of 2.83
## Picking joint bandwidth of 3.31
## Warning: Removed 33 rows containing missing values or values outside the scale range
## (`geom_text()`).

Now we make some plots, first the animals

ggplot(raw.plants, aes(x = MAP_damage, y = sample2, group = sample2)) +
  geom_density_ridges2(scale = 3, alpha = 0.55) +  # Plot density ridges first
  geom_point(position = position_jitter(height = 0.1, width = 0), alpha = 0.6) + # Add points above ridges
  facet_wrap(~ site, scales = "free_y") + # Split into facets by coreID
  labs(y = "Samples (ordered by age)", x = "MAP Damage",title = "Plants MAP Damage")+
  geom_text(aes(x = max(MAP_damage), label = age), hjust = -0.1, size = 3,nudge_y = 0.4)
## Picking joint bandwidth of NaN
## Picking joint bandwidth of 0.027
## Picking joint bandwidth of 0.0257
## Picking joint bandwidth of 0.0324
## Warning in FUN(X[[i]], ...): no non-missing arguments to max; returning -Inf
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_text()`).

ggplot(raw.plants, aes(x = mean_L, y = sample2, group = sample2)) +
  geom_density_ridges2(scale = 3, alpha = 0.55) +  # Plot density ridges first
  geom_point(position = position_jitter(height = 0.1, width = 0), alpha = 0.6) + # Add points above ridges
  facet_wrap(~ site, scales = "free_y") + # Split into facets by coreID
  labs(y = "Samples (ordered by age)", x = "MAP Damage",title = "Plants Mean Length")+
  geom_text(aes(x = max(MAP_damage), label = age), hjust = -0.1, size = 3,nudge_y = 0.4)
## Picking joint bandwidth of NaN
## Picking joint bandwidth of 4.2
## Picking joint bandwidth of 3.41
## Picking joint bandwidth of 4.55
## Warning in FUN(X[[i]], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(X[[i]], ...): Removed 5 rows containing missing values or values outside the scale range
## (`geom_text()`).

Richness Estimates

Now let’s look at some richness estimates per core

richness <- raw.metazoa.50.wide %>%
  select(-tax_name) %>%
  summarise(across(everything(), ~ sum(. > 0))) %>%
  pivot_longer(cols = everything(), names_to = "Sample", values_to = "Richness")

# Combine richness with metadata
richness_plot_data <- richness %>%
  left_join(metadata, by = "Sample")


ggplot(richness_plot_data, aes(x = age, y = Richness, color = CoreID)) +
  geom_point(size = 4) +
  theme_minimal() +
  labs(
    title = "Metazoan Genus Richness vs. Age by Core (50 reads)",
    x = "Age",
    y = "Richness",
    color = "Core ID"
  ) +
  theme(
    plot.title = element_text(size = 16, hjust = 0.5),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    legend.title = element_text(size = 12)
  )
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).

## More reads in the earlier core portion - lets use breakaway estimates 

results <- breakaway_nof1(raw.metazoa.1.wide[,-1])
## Warning in breakaway_nof1.data.frame(., output, plot, answers, print): Assuming
## taxa are rows
named_estimates <- map_dbl(results, ~ .x$estimate)
names(named_estimates) <- names(results)  

diversity_df <- tibble(
  Sample = names(named_estimates),
  Richness = as.numeric(named_estimates)
)

# Merge with metadata
plot_data <- diversity_df %>%
  left_join(metadata, by = "Sample")

# Create the plot
ggplot(plot_data, aes(x = age, y = Richness, color = CoreID)) +
  geom_point(size = 4) +
  theme_minimal() +
  labs(
    title = "Metazoan Genus Richness (breakaway estimate) vs. Age by Core (50 reads)",
    x = "Age",
    y = "Richness",
    color = "Core ID"
  ) +
  theme(
    plot.title = element_text(size = 16, hjust = 0.5),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    legend.title = element_text(size = 12)
  )
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Analyze and plot for raw.plants.50.wide
richness_plants <- raw.plants.50.wide %>%
  select(-tax_name) %>%
  summarise(across(everything(), ~ sum(. > 0))) %>%
  pivot_longer(cols = everything(), names_to = "Sample", values_to = "Richness")

richness_plot_data_plants <- richness_plants %>%
  left_join(metadata, by = "Sample")

ggplot(richness_plot_data_plants, aes(x = age, y = Richness, color = CoreID)) +
  geom_point(size = 4) +
  theme_minimal() +
  labs(
    title = "Plant Genus Richness vs. Age by Core (50 reads)",
    x = "Age",
    y = "Richness",
    color = "Core ID"
  ) +
  theme(
    plot.title = element_text(size = 16, hjust = 0.5),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    legend.title = element_text(size = 12)
  )
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Analyze and plot for raw.bacteria.50.wide
richness_bacteria <- raw.bacteria.50.wide %>%
  select(-tax_name) %>%
  summarise(across(everything(), ~ sum(. > 0))) %>%
  pivot_longer(cols = everything(), names_to = "Sample", values_to = "Richness")

richness_plot_data_bacteria <- richness_bacteria %>%
  left_join(metadata, by = "Sample")

ggplot(richness_plot_data_bacteria, aes(x = age, y = Richness, color = CoreID)) +
  geom_point(size = 4) +
  theme_minimal() +
  labs(
    title = "Bacterial Genus Richness vs. Age by Core (50 reads)",
    x = "Age",
    y = "Richness",
    color = "Core ID"
  ) +
  theme(
    plot.title = element_text(size = 16, hjust = 0.5),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    legend.title = element_text(size = 12)
  )
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).